#!/usr/bin/python

import sys,os,re,glob,math,glob,signal,traceback
import matplotlib
from collections import Counter
import argparse
from math import log

# Command-line interface.  The description reflects what this script does:
# it reads text files, counts words and the separators between them, and
# writes a weighted FST built from those counts.
parser = argparse.ArgumentParser(description = """
Builds a weighted dictionary FST from text files.  Words and the separators
between them are counted, each string is weighted by the negative log of its
relative frequency, and the resulting determinized and minimized FST is
written to the output file.
""")
parser.add_argument("args",default=[],nargs='*',help="input text files")
parser.add_argument("-s","--wordseps",action="store_true",
                    help="add word separator indicators")
parser.add_argument("-f","--factor",type=float,default=1.0,help="cost factor")
parser.add_argument("-F","--sfactor",type=float,default=None,
                    help="cost factor for separators (default: same as --factor)")
parser.add_argument("-o","--output",default="dict.fst",
                    help="output FST file (default: dict.fst)")
# NOTE(review): the original carried a "remove prior cost" marker here with no
# corresponding option; presumably a planned flag that was never implemented.
args = parser.parse_args()

# Separators use the word cost factor unless one was given explicitly.
if args.sfactor is None:
    args.sfactor = args.factor

# Splitting on a capturing group makes re.split() return the separators at the
# even indices and the matched word tokens at the odd indices.
_WORD_RE = re.compile(r"([\w'-]+)")
# Any field containing an ASCII control character (0x00-0x1f) is ignored.
_CONTROL_RE = re.compile(r"[\x00-\x1f]")

# Frequency tables: word tokens, and the strings found between/around them.
words = Counter()
seps = Counter()

for arg in args.args:
    with open(arg) as stream:
        # Iterate the file lazily instead of loading it all via readlines().
        for line in stream:
            # Remove only the line terminator.  Slicing off the last character
            # unconditionally would eat a real character when the final line
            # of the file has no trailing newline.
            line = line.rstrip("\n")
            fields = _WORD_RE.split(line)
            for index, field in enumerate(fields):
                if field == "" or _CONTROL_RE.search(field):
                    continue
                if index % 2:
                    words[field] += 1
                else:
                    seps[field] += 1

# Totals used later to turn counts into relative frequencies.
nwords = sum(words.values())
nseps = sum(seps.values())
# Single-string form prints the same text under Python 2 and Python 3.
print("%s %s" % (nseps, nwords))

import openfst
def Fst():
    """Return a newly constructed `openfst.StdVectorFst`."""
    fst = openfst.StdVectorFst()
    return fst

def _add_weighted_strings(fst, entries, total, factor):
    """Add every (string, count) pair in `entries` to `fst`.

    Each string is added with cost -log(count / total) scaled by `factor`.
    """
    for text, count in entries:
        cost = -log(count * 1.0 / total)
        fst.AddString(text, cost * factor)


# Word FST: one string per distinct word.  most_common() already yields the
# entries in descending count order, so no further sorting is needed.
wordfst = Fst()
print("wordfst")
_add_weighted_strings(wordfst, words.most_common(), nwords, args.factor)

# Separator FST: only the 10 most frequent separators are kept; their costs
# are still relative to the total count of all separators.
print("sepfst")
sepfst = Fst()
_add_weighted_strings(sepfst, seps.most_common(10), nseps, args.sfactor)

# Chain word -> [indicator] -> separator -> [indicator] onto wordfst; the
# indicator is only inserted when --wordseps was given.
print("concat")
indicatorfst = Fst()
# presumably maps empty input to the output symbol "|" -- confirm against the
# openfst binding's AddTranslation semantics
indicatorfst.AddTranslation("", "|")
if args.wordseps:
    openfst.ConcatOnto(wordfst, indicatorfst)
openfst.ConcatOnto(wordfst, sepfst)
if args.wordseps:
    openfst.ConcatOnto(wordfst, indicatorfst)

print("closure")
openfst.ClosurePlus(wordfst)

print("determinize")
det = Fst()
openfst.Determinize(wordfst, det)

print("minimize")
openfst.Minimize(det)

print("writing")
det.Write(args.output)
